Main analysis (Exploratory Data Analysis)
Analysis on Age distribution of our customer
set.seed(321)
index = sample(nrow(data_18_6), round(nrow(data_18_6)*0.001))
sample = data_18_6[index,]
g = ggplot(sample, aes(birth.year,minute))
g + geom_point(alpha = 0.5) + ylab("trip duration (in min)") +
xlab("users' birth year") + ggtitle("Scatterplot of uses' birth year and trip duration (in sec) in Jun 2018")
There are 1.945,611 observations in my data set, so we sampled 1% of them to see the relationship between user’s birth of year and trip durations.
We were expect to see an increasing and then decreasing trend as in the distribution of users’ birth year (right-skewed). However, there is no obvious trend in the scatterplot. ### Heatmap
## get station info
station.info <- data_18_6 %>%
group_by(start.station.id) %>%
summarise(lat=as.numeric(start.station.latitude[1]),
long=as.numeric(start.station.longitude[1]),
name=start.station.name[1],
n.trips=n())
leaflet(station.info) %>%setView(-74.00, 40.71, zoom = 12) %>%
addProviderTiles("CartoDB.Positron") %>%
addHeatmap(lng=~long, lat=~lat, intensity = ~n.trips, radius = 7, gradient = "YlGnBu")
pal = colorNumeric(
palette="YlOrRd",
domain = station.info$n.trips
)
leaflet(station.info) %>%setView(-74.00, 40.71, zoom = 12) %>%
leaflet::addProviderTiles("CartoDB.Positron") %>%
leaflet::addCircles(lng = ~long, lat = ~lat, color =~pal(n.trips)) %>%
leaflet::addLegend("bottomright", pal = pal, value = ~n.trips, title = "Num of Trips (2018.6)")
### Analysis on weather and season #### Join weather data to see if weather has an impact Is it reasonable to assume that weather and season has a huge impact on CitiBike frequency.
NYCweather<-read.csv("./data/NYCweather.csv")
df$Date <- as.Date(df$starttime)
NYCweather$Date<-as.Date(NYCweather$Date)
df_weather <- inner_join(df,NYCweather,by="Date")
df_weather$Severe <- as.factor(df_weather$Severe)
hist(df_weather$Date, breaks="days", freq=TRUE)
ggplot(df_weather, aes(x=Date))+
geom_histogram(colour="white", fill="steelblue")+
ggtitle("Number of trips by day")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(df_weather, aes(x=Severe, y=))+
geom_bar(colour="white", fill="steelblue")+
ggtitle("Number of trips per day for severe and unsevere weather condition")
ggplot(data = df_weather,
aes(x = Windspeed, y = minute)) +
scale_x_continuous("Windspeed") +
scale_y_continuous("Average trip duration in minutes") +
ggtitle("Trip duration vs. Windspeed") +
stat_summary(fun.y="mean", geom = "line", colour="steelblue", size=1)
ggplot(data = df_weather,
aes(x = Temperature, y = minute)) +
scale_x_continuous("Temperature") +
scale_y_continuous("Average trip duration in minutes") +
ggtitle("Trip duration vs. Temperature") +
stat_summary(fun.y="mean", geom="line", colour="steelblue", size=1)
ggplot(data = df_weather,
aes(x = Precipitation, y = minute)) +
scale_x_continuous("Precipitation") +
scale_y_continuous("Average trip duration in minutes") +
ggtitle("Trip duration vs. Precipitation") +
stat_summary(fun.y="mean", geom = "line", colour="steelblue", size=1)
ggplot(data = df_weather,
aes(x=factor(Severe), y=minute))+
xlab("Severity of weather")+ylab("Average trip length")+
stat_summary(fun.y="mean", geom="bar")
df_by_temp <- df_weather %>% group_by(Temperature) %>% summarise(n_distinct(starttime))
ggplot(data = df_by_temp, aes(x=as.numeric(Temperature), y=`n_distinct(starttime)`))+
stat_summary(geom="bar", fill="steelblue")+
xlab("Temperature")+
ylab("Average trips per day")+
ggtitle("Average daily trips in different temperatures")
## No summary function supplied, defaulting to `mean_se()
df_by_windspeed <- df_weather %>% group_by(Windspeed) %>% summarise(n_distinct(starttime))
ggplot(data = df_by_windspeed, aes(x=as.numeric(Windspeed), y=`n_distinct(starttime)`))+
stat_summary(geom="bar", fill="steelblue")+
xlab("Windspeed")+
ylab("Average trips per day")+
ggtitle("Average daily trips in different windspeeds")
## No summary function supplied, defaulting to `mean_se()
df_by_precip <- df_weather %>% group_by(Precipitation) %>% summarise(n_distinct(starttime))
ggplot(data = df_by_precip, aes(x=as.numeric(Precipitation), y=`n_distinct(starttime)`))+
stat_summary(geom="bar", fill="steelblue")+
xlab("Precipitation")+
ylab("Average trips per day")+
ggtitle("Average daily trips in different precipitations")
## No summary function supplied, defaulting to `mean_se()
#### Interactive time series to see if season has an impact on customer flow with Dygraph We combine over a year time series to see if season has an impact on customer flow with Dygraph. The data is from 201706 to 201806.
library(dygraphs)
library(xts)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following object is masked from 'package:leaflet':
##
## addLegend
## The following objects are masked from 'package:dplyr':
##
## first, last
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(zoo)
df_timeseries <- read.csv('./data/data_2017_2018.csv')
a <- lubridate::mdy_hms(df_timeseries$starttime)
## Warning: All formats failed to parse. No formats found.
b <- as.Date(df_timeseries$starttime)
b[is.na(b)] <- a[!is.na(a)]
df_timeseries$starttime <- b
df_timeseries <- df_timeseries %>% mutate(time = format(as.Date(starttime), "%Y-%m"))
df_timeseries <- df_timeseries[,c("time","usertype")]
df_subscriber <- df_timeseries %>% subset(usertype == 'Subscriber')
df_customer <- df_timeseries %>% subset(usertype == 'Customer')
df_subscriber <- df_subscriber %>% group_by(time) %>% summarise(Number = n())
df_customer <- df_customer %>% group_by(time) %>% summarise(Number = n())
ts_subscriber <- xts::xts(df_subscriber$Number,as.Date(as.yearmon(df_subscriber$time)))
ts_customer <- xts::xts(df_customer$Number,as.Date(as.yearmon(df_customer$time)))
ts <- cbind(ts_subscriber,ts_customer)
dygraph(ts,main = 'Impact of season',ylab = "Frequency") %>% dySeries('..1',label = 'subscriber ') %>% dySeries('..2',label = 'customer') %>% dyLegend(show = "always", hideOnMouseOut = FALSE,width=400) %>%
dyOptions(colors = RColorBrewer::brewer.pal(3, "Set2"))
### Analysis about customer flow
library(lubridate)
df_timeslot <- mutate(df,hour = hour(as.POSIXct(starttime)))
df_timeslot <- mutate(df_timeslot,wday = wday(as.POSIXct(starttime)))
df_timeslot <- df_timeslot[,c("hour","wday","minute","start.station.name","usertype")]
timeslot <- ggplot(data=df_timeslot,mapping=aes(as.factor(hour))) +geom_bar(color='Orange')+ggtitle('Time slot in trip data group by usertype')
timeslot + facet_wrap(~usertype)
From the above analysis, the overall time slot has two peaks: in 8-9 am and 5-6 pm, which is reasonable since they are the rush hour, which has a large number of passenger flow.
However, the time slot distribution differs a lot with aspect to different usertype and different weekdays.
Take the above images as examples, the customer has a higher peak in the time slot which is around 4 pm while the subscribers has two peaks.
timeslot <- ggplot(data=df_timeslot,mapping=aes(as.factor(hour))) +geom_bar(color='Orange')+ggtitle('Time slot in trip data group by weekdays')
timeslot + facet_wrap(~as.factor(wday))
Also, the weekdays play a big role in the time duration distribution. On weekdays, we still sees similar two peaks. However, on weekends, we can only observe one peak. Thus, we can think some specific pricing strategy for a specific time slot in the weekdays.